
#ifndef __GMS_FDIFF1246_AVX512_HPP__
#define __GMS_FDIFF1246_AVX512_HPP__ 100220221224



namespace file_version {

    const unsigned int GMS_FDIFF_AVX512_MAJOR = 1U;
    const unsigned int GMS_FDIFF_AVX512_MINOR = 0U;
    const unsigned int GMS_FDIFF_AVX512_MICRO = 0U;
    const unsigned int GMS_FDIFF_AVX512_FULLVER =
      1000U*GMS_FDIFF_AVX512_MAJOR+
      100U*GMS_FDIFF_AVX512_MINOR+
      10U*GMS_FDIFF_AVX512_MICRO;
    const char * const GMS_FDIFF_AVX512_CREATION_DATE = "10-02-2022 12:24 PM +00200 (THR 10 FEB 2022 GMT+2)";
    const char * const GMS_DERIVATIVE_AVX512_BUILD_DATE    = __DATE__ ":" __TIME__;
    const char * const GMS_DERIVATIVE_AVX512_AUTHOR        = "Programmer: Bernard Gingold, contact: beniekg@gmail.com";
    const char * const GMS_DERIVATIVE_AVX512_DESCRIPTION   = "Vectorized (AVX512) finite difference implementation."

}

   /*
           Based on Boost: /boost/math/tools/numerical_differentiation.hpp
           Manually vectorized and slightly modified in order to retain
           the SIMD performance.
          //  (C) Copyright Nick Thompson 2018.
          //  Use, modification and distribution are subject to the
          //  Boost Software License, Version 1.0. (See accompanying file
          //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

          Boost Software License - Version 1.0 - August 17th, 2003
          Permission is hereby granted, free of charge, to any person or organization
          obtaining a copy of the software and accompanying documentation covered by
          this license (the "Software") to use, reproduce, display, distribute,
          execute, and transmit the Software, and to prepare derivative works of the
          Software, and to permit third-parties to whom the Software is furnished to
          do so, all subject to the following:

          The copyright notices in the Software and this entire statement, including
          the above license grant, this restriction and the following disclaimer,
          must be included in all copies of the Software, in whole or in part, and
          all derivative works of the Software, unless such copies or derivative
          works are solely in the form of machine-executable object code generated by
          a source language processor.

          THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
          IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
          FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
          SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
          FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
          ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
          DEALINGS IN THE SOFTWARE.
    */

    /*
   MIT License

Copyright (c) 2020 Bernard Gingold

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/


#include <immintrin.h>
#include <limits>
#include "GMS_config.h"


namespace gms {

      namespace math {

#define CORRECT_XH_SLOW 0

#if (CORRECT_XH_SLOW) == 1
#include <cmath>
                        __ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512
			correct_xh_zmm16r4(const __m512 x,
			                       __m512 & h) {
                            
			    const __m512 _0     = _mm512_setzero_ps();
			    const __mmask16 eq0 = 0x0;
			    const __m512 zmm0   =  _mm512_add_ps(x,h);
			    h                   = _mm512_sub_ps(zmm0,x);
			    eq0                 = _mm512_cmp_ps_mask(h,_0,_CMP_EQ_OQ);
			    if(eq0) {
			       const float fmax = std::numeric_limits<float>::max();
                               __ATTR_ALIGN__(64) float v[16];
			       _mm512_store_ps(&v[0],x);
			       h = _mm512_setr_ps(std::nextafter(v[0],fmax)-v[0],
			                          std::nextafter(v[1],fmax)-v[1],
						  std::nextafter(v[2],fmax)-v[2],
						  std::nextafter(v[3],fmax)-v[3],
						  std::nextafter(v[4],fmax)-v[4],
						  std::nextafter(v[5],fmax)-v[5],
			                          std::nextafter(v[6],fmax)-v[6],
						  std::nextafter(v[7],fmax)-v[7],
						  std::nextafter(v[8],fmax)-v[8],
						  std::nextafter(v[9],fmax)-v[9],
						  std::nextafter(v[10],fmax)-v[10],
						  std::nextafter(v[11],fmax)-v[11],
						  std::nextafter(v[12],fmax)-v[12],
						  std::nextafter(v[13],fmax)-v[13],
						  std::nextafter(v[14],fmax)-v[14],
						  std::nextafter(v[15],fmax)-v[15])
			    }
			    return (h);
			    
		       }


		        __ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512d
			correct_xh_zmm8r8(const __m512d x,
			                       __m512d & h) {
                            
			    const __m512d _0     = _mm512_setzero_pd();
			    const __mmask8 eq0 = 0x0;
			    const __m512d zmm0   = _mm512_add_pd(x,h);
			    h                   = _mm512_sub_pd(zmm0,x);
			    eq0                 = _mm512_cmp_pd_mask(h,_0,_CMP_EQ_OQ);
			    if(eq0) {
			       const double fmax = std::numeric_limits<double>::max();
                               __ATTR_ALIGN__(64) double v[8];
			       _mm512_store_pd(&v[0],x);
			       h = _mm512_setr_pd(std::nextafter(v[0],fmax)-v[0],
			                          std::nextafter(v[1],fmax)-v[1],
						  std::nextafter(v[2],fmax)-v[2],
						  std::nextafter(v[3],fmax)-v[3],
						  std::nextafter(v[4],fmax)-v[4],
						  std::nextafter(v[5],fmax)-v[5],
			                          std::nextafter(v[6],fmax)-v[6],
						  std::nextafter(v[7],fmax)-v[7]);
						
			    }
			    return (h);
			    
		       }

#else

                        __ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512
			correct_xh_zmm16r4(const __m512 x,
			                       __m512 & h) {
					       
                            const __m512 _0     = _mm512_setzero_ps();
			    const __mmask16 eq0 = 0x0;
                            const __m512 tiny   = _mm512_set1_ps(std::numeric_limits<float>::epsilon());
			    const __m512 zmm0   = _mm512_add_ps(x,h);
			    h                   = _mm512_sub_ps(zmm0,x);
			    eq0                 = _mm512_cmp_ps_mask(h,_0,_CMP_EQ_OQ);
			    h                   = _mm512_mask_blend_ps(eq0,h,_mm512_add_ps(h,tiny));
			    return (h);
		       }


		        __ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512d
			correct_xh_zmm8r8(const __m512d x,
			                       __m512d & h) {
					       
                            const __m512d _0     = _mm512_setzero_pd();
			    const __mmask8 eq0   = 0x0;
                            const __m512d tiny   = _mm512_set1_pd(std::numeric_limits<double>::epsilon());
			    const __m512d zmm0   = _mm512_add_pd(x,h);
			    h                   = _mm512_sub_pd(zmm0,x);
			    eq0                 = _mm512_cmp_pd_mask(h,_0,_CMP_EQ_OQ);
			    h                   = _mm512_mask_blend_pd(eq0,h,_mm512_add_pd(h,tiny));
			    return (h);
		       }

#endif
		        __ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512
			finite_diff_1_zmm16r4(__m512(*f)(__m512),
			                      const __m512 x,
					      __m512 &err,
					      const bool cmperr) {

			    // Error bound ~eps^1/2
                            // Note that this estimate of h differs from the best estimate by a factor of sqrt((|f(x)| + |f(x+h)|)/|f''(x)|).
                            // Since this factor is invariant under the scaling f -> kf, then we are somewhat justified in approximating it by 1.
                            // This approximation will get better as we move to higher orders of accuracy.
			    const __m512 veps = _mm512_set1_ps(std::numeric_limits<float>::epsilon());
			    const __m512 _2   = _mm512_set1_ps(2.0f);
			    const __m512 _0_5 = _mm512_set1_ps(0.5f);
			    __mmask16    eq0  = 0x0;
			    __m512 h          = _mm512_mul_ps(_2,_mm512_sqrt_ps(veps));
			    h                 = correct_xh_zmm16r4(x,h);
			    const __m512 yh   = f(_mm512_add_ps(x,h);
			    const __m512 y0   = f(x);
			    const __m512 diff = _mm512_sub_ps(yh,y0);
			    if(cmperr) {
                               const __m512 ym   = f(_mm512_sub_ps(x,h);
			       const __m512 t0   = _mm512_fmadd_ps(_mm512_sub_ps(yh,_2),y0,ym);
			       const __m512 ypph = _mm512_div_ps(_mm512_abs_ps(t0),h);
			       const __m512 t1   = _mm512_mul_ps(ypph,_0_5);
			       const __m512 t2   = _mm512_div_ps(_mm512_fmadd_ps(_mm512_abs_ps(yh),
			                                           _mm512_abs_ps(y0),veps),h);
			       err               = _mm512_add_ps(t1,t2);
			    }
			    return (_mm512_div_ps(diff,h));
			}


		        __ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512
			finite_diff_1_zmm8r8(__m512d (*f)(__m512d),
			                      const __m512d x,
					      __m512d &err,
					      const bool cmperr) {

			    // Error bound ~eps^1/2
                            // Note that this estimate of h differs from the best estimate by a factor of sqrt((|f(x)| + |f(x+h)|)/|f''(x)|).
                            // Since this factor is invariant under the scaling f -> kf, then we are somewhat justified in approximating it by 1.
                            // This approximation will get better as we move to higher orders of accuracy.
			    const __m512d veps = _mm512_set1_pd(std::numeric_limits<double>::epsilon());
			    const __m512d _2   = _mm512_set1_pd(2.0);
			    const __m512d _0_5 = _mm512_set1_pd(0.5);
			    __mmask8    eq0  = 0x0;
			    __m512d h          = _mm512_mul_ps(_2,_mm512_sqrt_pd(veps));
			    h                  = correct_xh_zmm8r8(x,h);
			    const __m512d yh   = f(_mm512_add_pd(x,h);
			    const __m512d y0   = f(x);
			    const __m512d diff = _mm512_sub_pd(yh,y0);
			    if(cmperr) {
                               const __m512d ym   = f(_mm512_sub_pd(x,h);
			       const __m512d t0   = _mm512_fmadd_pd(_mm512_sub_pd(yh,_2),y0,ym);
			       const __m512d ypph = _mm512_div_pd(_mm512_abs_pd(t0),h);
			       const __m512d t1   = _mm512_mul_pd(ypph,_0_5);
			       const __m512d t2   = _mm512_div_pd(_mm512_fmadd_pd(_mm512_abs_pd(yh),
			                                           _mm512_abs_pd(y0),veps),h);
			       err               = _mm512_add_pd(t1,t2);
			    }
			    return (_mm512_div_pd(diff,h));
			}


			__ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512
			finite_diff_2_zmm16r4(__m512(*f)(__m512),
			                      const __m512 x,
					      __m512 &err,
					      const bool cmperr) {

                            const __m512 veps = _mm512_set1_ps(std::numeric_limits<float>::epsilon());
			    const __m512 veps3= _mm512_mul_ps(veps,_mm512_set1_ps(3.0f));
			    const __m512 third= _mm512_set1_ps(0.3333333333333333333333333333333333f);
			    __m512 h          = _mm512_pow_ps(veps3,third);
			    h                 = correct_xh_zmm16r4(x,h);
			    const __m512 two  = _mm512_set1_ps(2.0f);
			    const __m512 yh   = f(_mm512_add_ps(x,h));
			    const __m512 ymh  = f(_mm512_sub_ps(x,h));
			    const __m512 diff = _mm512_sub_ps(yh,ymh);
			    if(cmperr) {
                               const __m512 half = _mm512_set1_ps(0.5f);
			       
			       const __m512 six  = _mm512_set1_ps(6.0f);
			       const __m512 h2   = _mm512_mul_ps(two,h);
			       const __m512 h6   = _mm512_mul_ps(two,six);
			       const __m512 yth  = f(_mm512_add_ps(x,h2));
			       const __m512 t0   = _mm512_add_ps(_mm512_abs_ps(yh),
			                                         _mm512_abs_ps(ymh));
			       const __m512 t1   = _mm512_mul_ps(veps,_mm512_div_ps(t0,h2));
			       const __m512 ymth = f(_mm512_sub_ps(x,h2));
			       const __m512 t2   = _mm512_abs_ps(_mm512_sub_ps(yth,ymth));
			       const __m512 t3   = _mm512_mul_ps(half,_mm512_sub_ps(two,diff));
			       const __m512 t4   = _mm512_div_ps(t3,t2);
			       err               = _mm512_add_ps(t1,t4);
			    }
			    return (_mm512_div_ps(diff,_mm512_mul_ps(two,h)));
		       }


		       	__ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512d
			finite_diff_2_zmm8r8(__m512d(*f)(__m512d),
			                      const __m512d x,
					      __m512d &err,
					      const bool cmperr) {

                            const __m512d veps = _mm512_set1_pd(std::numeric_limits<double>::epsilon());
			    const __m512d veps3= _mm512_mul_pd(veps,_mm512_set1_ps(3.0));
			    const __m512d third= _mm512_set1_pd(0.3333333333333333333333333333333333);
			    __m512d h          = _mm512_pow_pd(veps3,third);
			    h                 = correct_xh_zmm8r8(x,h);
			    const __m512d two  = _mm512_set1_pd(2.0);
			    const __m512d yh   = f(_mm512_add_pd(x,h));
			    const __m512d ymh  = f(_mm512_sub_pd(x,h));
			    const __m512d diff = _mm512_sub_pd(yh,ymh);
			    if(cmperr) {
                               const __m512d half = _mm512_set1_pd(0.5);
			       
			       const __m512d six  = _mm512_set1_pd(6.0);
			       const __m512d h2   = _mm512_mul_pd(two,h);
			       const __m512d h6   = _mm512_mul_pd(two,six);
			       const __m512d yth  = f(_mm512_add_pd(x,h2));
			       const __m512d t0   = _mm512_add_pd(_mm512_abs_pd(yh),
			                                         _mm512_abs_pd(ymh));
			       const __m512d t1   = _mm512_mul_pd(veps,_mm512_div_pd(t0,h2));
			       const __m512d ymth = f(_mm512_sub_pd(x,h2));
			       const __m512d t2   = _mm512_abs_pd(_mm512_sub_pd(yth,ymth));
			       const __m512d t3   = _mm512_mul_pd(half,_mm512_sub_pd(two,diff));
			       const __m512d t4   = _mm512_div_pd(t3,t2);
			       err               = _mm512_add_pd(t1,t4);
			    }
			    return (_mm512_div_pd(diff,_mm512_mul_pd(two,h)));
		       }


		       	__ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512
			finite_diff_4_zmm16r4(__m512(*f)(__m512),
			                      const __m512 x,
					      __m512 &err,
					      const bool cmperr) {

			     const __m512 veps   = _mm512_set1_ps(std::numeric_limits<float>::epsilon());
			     const __m512 two    = _mm512_set1_ps(2.0f);
			     const __m512 veps11 = _mm512_mul_ps(_mm512_set1_ps(11.25f),veps);
			     const __m512 fifth  = _mm512_set1_ps(0.2f);
			     __m512       h      = _mm512_pow_ps(veps11,fifth);
			     h                   = correct_xh_zmm16r4(x,h);
			     const __m512 h2     = _mm512_mul_ps(two,h);
			     const __m512 h12    = _mm512_mul_ps(_mm512_set1_ps(12.0f),h);
			     const __m512 half   = _mm512_set1_ps(0.5f);
			     const __m512 ymth   = f(_mm512_sub_ps(x,h2));
			     const __m512 yth    = f(_mm512_add_ps(x,h2));
			     const __m512 yh     = f(_mm512_add_ps(x,h));
			     const __m512 ymh    = f(_mm512_sub_ps(x,h));
			     const __m512 y2     = _mm512_sub_ps(ymth,yth);
			     const __m512 y1     = _mm512_sub_ps(yh,ymh);
			     if(cmperr) {
			        const __m512 five= _mm512_set1_ps(5.0f);
                                const __m512 h3  = _mm512_mul_ps(_mm512_set1_ps(3.0f),h);
				const __m512 h30 = _mm512_mul_ps(_mm512_set1_ps(30.0f),h);
				const __m512 y3h = f(_mm512_add_ps(x,h3));
				const __m512 ym3h= f(_mm512_sub_ps(x,h3));
				const __m512 t0  = _mm512_mul_ps(_mm512_sub_ps(y3h,ym3h),half);
			        const __m512 t1  = _mm512_mul_ps(_mm512_sub_ps(ymth,yth),half);                      
				const __m512 t2  = _mm512_mul_ps(five,_mm512_mul_ps(half,_mm512_sub_ps(yh,ymh)));
				err              = _mm512_div_ps(_mm512_abs_ps(_mm512_add_ps(t0,
				                                               _mm512_add_ps(t1,t2))),h30);
				t0               = _mm512_add_ps(_mm512_abs_ps(yth),
				                                 _mm512_abs_ps(ymth));
				t1               = _mm512_mul_ps(_mm512_set1_ps(8.0f),
				                                 _mm512_add_ps(_mm512_abs_ps(ymh),
								               _mm512_abs_ps(yh)));
				t2               = _mm512_div_ps(h12,_mm512_add_ps(t0,t1));
				err              = _mm512_add_ps(err,_mm512_mul_ps(veps,t2));
			     }
			     return (_mm512_div_ps(_mm512_fmadd_ps(_mm512_set1_ps(8.0f),y1,y2)));
			                                           
		       }


		        __ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512d
			finite_diff_4_zmm8r8(__m512d(*f)(__m512d),
			                      const __m512d x,
					      __m512d &err,
					      const bool cmperr) {

			     const __m512d veps   = _mm512_set1_pd(std::numeric_limits<double>::epsilon());
			     const __m512d two    = _mm512_set1_pd(2.0);
			     const __m512d veps11 = _mm512_mul_pd(_mm512_set1_pd(11.25),veps);
			     const __m512d fifth  = _mm512_set1_pd(0.2);
			     __m512d       h      = _mm512_pow_pd(veps11,fifth);
			     h                   = correct_xh_zmm8r8(x,h);
			     const __m512d h2     = _mm512_mul_pd(two,h);
			     const __m512d h12    = _mm512_mul_pd(_mm512_set1_pd(12.0),h);
			     const __m512d half   = _mm512_set1_pd(0.5);
			     const __m512d ymth   = f(_mm512_sub_pd(x,h2));
			     const __m512d yth    = f(_mm512_add_pd(x,h2));
			     const __m512d yh     = f(_mm512_add_pd(x,h));
			     const __m512d ymh    = f(_mm512_sub_pd(x,h));
			     const __m512d y2     = _mm512_sub_pd(ymth,yth);
			     const __m512d y1     = _mm512_sub_pd(yh,ymh);
			     if(cmperr) {
			        const __m512d five= _mm512_set1_pd(5.0);
                                const __m512d h3  = _mm512_mul_pd(_mm512_set1_pd(3.0),h);
				const __m512d h30 = _mm512_mul_pd(_mm512_set1_pd(30.0),h);
				const __m512d y3h = f(_mm512_add_pd(x,h3));
				const __m512d ym3h= f(_mm512_sub_pd(x,h3));
				const __m512d t0  = _mm512_mul_pd(_mm512_sub_pd(y3h,ym3h),half);
			        const __m512d t1  = _mm512_mul_pd(_mm512_sub_pd(ymth,yth),half);                      
				const __m512d t2  = _mm512_mul_pd(five,_mm512_mul_pd(half,_mm512_sub_pd(yh,ymh)));
				err              = _mm512_div_pd(_mm512_abs_pd(_mm512_add_pd(t0,
				                                               _mm512_add_pd(t1,t2))),h30);
				t0               = _mm512_add_pd(_mm512_abs_pd(yth),
				                                 _mm512_abs_pd(ymth));
				t1               = _mm512_mul_pd(_mm512_set1_pd(8.0),
				                                 _mm512_add_pd(_mm512_abs_pd(ymh),
								               _mm512_abs_pd(yh)));
				t2               = _mm512_div_pd(h12,_mm512_add_pd(t0,t1));
				err              = _mm512_add_pd(err,_mm512_mul_pd(veps,t2));
			     }
			     return (_mm512_div_pd(_mm512_fmadd_pd(_mm512_set1_pd(8.0),y1,y2)));
			                                           
		       }


		        __ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512
			finite_diff_6_zmm16r4(__m512(*f)(__m512),
			                      const __m512 x,
					      __m512 &err,
					      const bool cmperr) {

			       const __m512  veps   = _mm512_set1_ps(std::numeric_limits<float>::epsilon());
			         // Error bound ~eps^6/7
                                 // Error: h^6f^(7)(x)/140 + 5|f(x)|eps/h
			       const __m512  two    = _mm512_set1_ps(2.0f);
			       const __m512  three  = _mm512_set1_ps(3.0f);
			       const __m512  four   = _mm512_set1_ps(4.0f);
			       const __m512  nine   = _mm512_set1_ps(9.0f);
			       const __m512  frtfv  = _mm512_set1_ps(45.0f);
			       const __m512  sixty  = _mm512_set1_ps(60.0f);
			       const __m512  svnth  = _mm512_set1_ps(0.1428571428571428571429f);
			       const __m512  epsr   = _mm512_div_ps(veps,_mm512_set1_ps(168.0f));
			       __m512        h      = _mm512_pow_ps(epsr,svnth);
			       h                    = correct_xh_zmm16r4(x,h);
			      
			       const __m512 h4      = _mm512_mul_ps(four,h);
			       const __m512 yh      = f(_mm512_add_ps(x,h));
			       const __m512 h2      = _mm512_mul_ps(two,h);
			       const __m512 ymh     = f(_mm512_sub_ps(x,h));
			       const __m512 h3      = _mm512_mul_ps(three,h);
			       const __m512 y1      = _mm512_sub_ps(yh,ymh);
			       const __m512 y2      = _mm512_sub_ps(f(_mm512_sub_ps(x,h2)),
			                                            f(_mm512_add_ps(x,h2)));
			       const __m512 y3      = _mm512_sub_ps(f(_mm512_add_ps(x,h3)),
			                                            f(_mm512_sub_ps(x,h3)));
			       if(cmperr) {

                                   // Mathematica code to generate fd scheme for 7th derivative:
                                   // Sum[(-1)^i*Binomial[7, i]*(f[x+(3-i)*h] + f[x+(4-i)*h])/2, {i, 0, 7}]
                                   // Mathematica to demonstrate that this is a finite difference formula for 7th derivative:
                                   // Series[(f[x+4*h]-f[x-4*h] + 6*(f[x-3*h] - f[x+3*h]) + 14*(f[x-h] - f[x+h] + f[x+2*h] - f[x-2*h]))/2, {h, 0, 15}]
				   const __m512 half= _mm512_set1_ps(0.5f);
				   
				   const __m512 y4  = _mm512_sub_ps(f(_mm512_add_ps(x,h4)),
				                                    f(_mm512_sub_ps(x,h4)));
				   const __m512 t0  = _mm512_fmsub_ps(_mm512_set1_ps(6.0f),y3,
				                                      _mm512_set1_ps(14.0f));
				   const __m512 t1  = _mm512_fmsub_ps(t0,y1,_mm512_mul_ps(
				                                            _mm512_set1_ps(14.0f),y2));
				   const __m512 y7  = _mm512_mul_ps(half,_mm512_sub_ps(y4,t1));
				   const __m512 t2  = _mm512_div_ps(_mm512_abs_ps(y7),
				                                    _mm512_mul_ps(_mm512_set1_ps(140.0f),h));
				   
				   const __m512 t3  = _mm512_div_ps(_mm512_mul_ps(_mm512_set1_ps(5.0f),
				                                    _mm512_mul_ps(_mm512_add_ps(
								                  _mm512_abs_ps(yh),
										  _mm512_abs_ps(yhm)),veps)),h);
				   err              = _mm512_add_ps(t2,t3);
				                                    
			       }                

			       const __m512 t0 = _mm512_mul_ps(sixty,h);
			       const __m512 t1 = _mm512_fmadd_ps(_mm512_fmadd_ps(y3,nine,y2),frtfv,y1);
			       return (_mm512_div_ps(t1,t0));
			       
		       }


		        __ATTR_ALWAYS_INLINE__
                        __ATTR_HOT__
                        __ATTR_ALIGN__(32)
			__ATTR_VECTORCALL__
	                static inline
			__m512d
			finite_diff_6_zmm8r8(__m512d(*f)(__m512d),
			                      const __m512d x,
					      __m512d &err,
					      const bool cmperr) {

			       const __m512d  veps   = _mm512_set1_ps(std::numeric_limits<double>::epsilon());
			         // Error bound ~eps^6/7
                                 // Error: h^6f^(7)(x)/140 + 5|f(x)|eps/h
			       const __m512d  two    = _mm512_set1_pd(2.0);
			       const __m512d  three  = _mm512_set1_pd(3.0);
			       const __m512d  four   = _mm512_set1_pd(4.0);
			       const __m512d  nine   = _mm512_set1_pd(9.0);
			       const __m512d  frtfv  = _mm512_set1_pd(45.0);
			       const __m512d  sixty  = _mm512_set1_pd(60.0);
			       const __m512d  svnth  = _mm512_set1_pd(0.1428571428571428571429);
			       const __m512d  epsr   = _mm512_div_pd(veps,_mm512_set1_pd(168.0));
			       __m512d        h      = _mm512_pow_pd(epsr,svnth);
			       h                     = correct_xh_zmm8r8(x,h);
			      
			       const __m512d h4      = _mm512_mul_pd(four,h);
			       const __m512d yh      = f(_mm512_add_pd(x,h));
			       const __m512d h2      = _mm512_mul_pd(two,h);
			       const __m512d ymh     = f(_mm512_sub_pd(x,h));
			       const __m512d h3      = _mm512_mul_pd(three,h);
			       const __m512d y1      = _mm512_sub_pd(yh,ymh);
			       const __m512d y2      = _mm512_sub_pd(f(_mm512_sub_pd(x,h2)),
			                                            f(_mm512_add_pd(x,h2)));
			       const __m512d y3      = _mm512_sub_pd(f(_mm512_add_pd(x,h3)),
			                                            f(_mm512_sub_pd(x,h3)));
			       if(cmperr) {

                                   // Mathematica code to generate fd scheme for 7th derivative:
                                   // Sum[(-1)^i*Binomial[7, i]*(f[x+(3-i)*h] + f[x+(4-i)*h])/2, {i, 0, 7}]
                                   // Mathematica to demonstrate that this is a finite difference formula for 7th derivative:
                                   // Series[(f[x+4*h]-f[x-4*h] + 6*(f[x-3*h] - f[x+3*h]) + 14*(f[x-h] - f[x+h] + f[x+2*h] - f[x-2*h]))/2, {h, 0, 15}]
				   const __m512d half= _mm512_set1_pd(0.5);
				   
				   const __m512d y4  = _mm512_sub_pd(f(_mm512_add_pd(x,h4)),
				                                    f(_mm512_sub_pd(x,h4)));
				   const __m512d t0  = _mm512_fmsub_pd(_mm512_set1_pd(6.0d),y3,
				                                      _mm512_set1_pd(14.0d));
				   const __m512d t1  = _mm512_fmsub_pd(t0,y1,_mm512_mul_pd(
				                                            _mm512_set1_pd(14.0),y2));
				   const __m512d y7  = _mm512_mul_pd(half,_mm512_sub_pd(y4,t1));
				   const __m512d t2  = _mm512_div_pd(_mm512_abs_pd(y7),
				                                    _mm512_mul_pd(_mm512_set1_pd(140.0),h));
				   
				   const __m512d t3  = _mm512_div_pd(_mm512_mul_pd(_mm512_set1_pd(5.0),
				                                    _mm512_mul_pd(_mm512_add_pd(
								                  _mm512_abs_pd(yh),
										  _mm512_abs_pd(yhm)),veps)),h);
				   err              = _mm512_add_pd(t2,t3);
				                                    
			       }                

			       const __m512d t0 = _mm512_mul_pd(sixty,h);
			       const __m512d t1 = _mm512_fmadd_pd(_mm512_fmadd_pd(y3,nine,y2),frtfv,y1);
			       return (_mm512_div_pd(t1,t0));
			       
		       }
			
    }

]

















#endif /*__GMS_FDIFF1246_AVX512_HPP__*/
